HateCrime_df=
read_csv("./data/HateCrimes.csv")
## Parsed with column specification:
## cols(
## state = col_character(),
## unemployment = col_character(),
## urbanization = col_character(),
## median_household_income = col_double(),
## perc_population_with_high_school_degree = col_double(),
## perc_non_citizen = col_double(),
## gini_index = col_double(),
## perc_non_white = col_double(),
## hate_crimes_per_100k_splc = col_character()
## )
#Noting we do not have hatecrime stats for Hawaii,North Dakota,South Dakota,and Wyoming
For analysis purpose, I delelte rows that do not have hate-crime record, and convert hate crime data to numerical value
HateCrime_df =
HateCrime_df %>%
filter(hate_crimes_per_100k_splc != "N/A") %>%
na.omit() %>%
mutate(hate_crimes_per_100k_splc = as.numeric(hate_crimes_per_100k_splc))
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#HateCrime_df %>%
# filter(hate_crimes_per_100k_splc != "N/A") %>%
# mutate(hate_crimes_per_100k_splc = as.numeric(hate_crimes_per_100k_splc)) %>%
# ggplot(aes(x=hate_crimes_per_100k_splc))+
# geom_boxplot()+
# coord_flip()
HateCrime_df %>%
mutate(text_label = str_c(" hate crime rate per 100,000 population: ", hate_crimes_per_100k_splc, "\nState: ", state)) %>%
plot_ly(y = ~hate_crimes_per_100k_splc, type = "box", colors = "viridis",text = ~text_label)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
HateCrime_df %>%
mutate(state = fct_reorder(state, hate_crimes_per_100k_splc)) %>%
ggplot(aes(x=hate_crimes_per_100k_splc)) +
geom_histogram(fill="darkorange2") +
labs(x = "hate crime rate per 100,000 population", y = "frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# plot_ly(y = ~price, color = ~neighbourhood, type = "box", colors = "viridis")
HateCrime_df %>%
mutate(state = fct_reorder(state, hate_crimes_per_100k_splc)) %>%
ggplot(aes(x=state,y=hate_crimes_per_100k_splc,fill=state)) +
geom_bar( stat = "identity")+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(x = "state", y = "hate crime rate per 100,000 population")+
theme(legend.position = "none")
summary(HateCrime_df$hate_crimes_per_100k_splc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.06906 0.14374 0.22620 0.30243 0.35062 1.52230
log=
HateCrime_df %>%
mutate(state = fct_reorder(state, hate_crimes_per_100k_splc),
hate_crimes_per_100k_splc = log(hate_crimes_per_100k_splc) ) %>%
ggplot(aes(x=hate_crimes_per_100k_splc)) +
geom_histogram(fill="firebrick4") +
labs(title= "log transformation",x = "log(hate crime rate per 100,000 population)", y = "frequency")
squre_root=
HateCrime_df %>%
mutate(state = fct_reorder(state, hate_crimes_per_100k_splc),
hate_crimes_per_100k_splc = (hate_crimes_per_100k_splc)^(0.5) ) %>%
ggplot(aes(x=hate_crimes_per_100k_splc)) +
geom_histogram(fill="firebrick4") +
labs(title= "square root transformation",x = "sqrt(hate crime rate per 100,000 population)", y = "frequency")
cube_root=
HateCrime_df %>%
mutate(state = fct_reorder(state, hate_crimes_per_100k_splc),
hate_crimes_per_100k_splc = (hate_crimes_per_100k_splc)^(1/3) ) %>%
ggplot(aes(x=hate_crimes_per_100k_splc)) +
geom_histogram(fill="firebrick4") +
labs(title= "cube root transformation",x = "cube_root(hate crime rate per 100,000 population)", y = "frequency")
(log+squre_root)/cube_root
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
HateCrime_df%>%
select(-state,-unemployment,-urbanization) %>%
pairs()
HateCrime_df%>%
select(-state,-unemployment,-urbanization) %>%
cor()
## median_household_income
## median_household_income 1.00000000
## perc_population_with_high_school_degree 0.65113832
## perc_non_citizen 0.30173941
## gini_index -0.12952158
## perc_non_white 0.03905399
## hate_crimes_per_100k_splc 0.34378921
## perc_population_with_high_school_degree
## median_household_income 0.6511383
## perc_population_with_high_school_degree 1.0000000
## perc_non_citizen -0.2621288
## gini_index -0.5371591
## perc_non_white -0.4958932
## hate_crimes_per_100k_splc 0.2628198
## perc_non_citizen gini_index
## median_household_income 0.3017394 -0.1295216
## perc_population_with_high_school_degree -0.2621288 -0.5371591
## perc_non_citizen 1.0000000 0.4798976
## gini_index 0.4798976 1.0000000
## perc_non_white 0.7526102 0.5484035
## hate_crimes_per_100k_splc 0.2435066 0.3805028
## perc_non_white
## median_household_income 0.03905399
## perc_population_with_high_school_degree -0.49589321
## perc_non_citizen 0.75261020
## gini_index 0.54840351
## perc_non_white 1.00000000
## hate_crimes_per_100k_splc 0.11116503
## hate_crimes_per_100k_splc
## median_household_income 0.3437892
## perc_population_with_high_school_degree 0.2628198
## perc_non_citizen 0.2435066
## gini_index 0.3805028
## perc_non_white 0.1111650
## hate_crimes_per_100k_splc 1.0000000
percent noncitizen population and percent nonwhite population have have correlation of -.7526. Similarly, another high correlation of 0.65 is observed between median household income and percent of adults 25 and older with at least a high school degreemult_fit <- lm(hate_crimes_per_100k_splc ~ median_household_income + perc_non_white + perc_non_citizen + perc_population_with_high_school_degree + gini_index + unemployment + urbanization, data=HateCrime_df)
summary(mult_fit)
##
## Call:
## lm(formula = hate_crimes_per_100k_splc ~ median_household_income +
## perc_non_white + perc_non_citizen + perc_population_with_high_school_degree +
## gini_index + unemployment + urbanization, data = HateCrime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36552 -0.10314 -0.01316 0.09731 0.51389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.296e+00 1.908e+00 -4.349 0.000103
## median_household_income -1.504e-06 5.961e-06 -0.252 0.802193
## perc_non_white -5.842e-03 3.673e-01 -0.016 0.987396
## perc_non_citizen 1.233e+00 1.877e+00 0.657 0.515332
## perc_population_with_high_school_degree 5.382e+00 1.835e+00 2.933 0.005735
## gini_index 8.624e+00 1.973e+00 4.370 9.67e-05
## unemploymentlow 1.307e-02 7.173e-02 0.182 0.856425
## urbanizationlow 3.309e-02 8.475e-02 0.390 0.698475
##
## (Intercept) ***
## median_household_income
## perc_non_white
## perc_non_citizen
## perc_population_with_high_school_degree **
## gini_index ***
## unemploymentlow
## urbanizationlow
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2014 on 37 degrees of freedom
## Multiple R-squared: 0.461, Adjusted R-squared: 0.3591
## F-statistic: 4.521 on 7 and 37 DF, p-value: 0.001007
boxcox(mult_fit)
- From Boxcox we also see the lambda value is close to 0 which means a log-transformation is suggested in this case.
par(mfrow=c(2,2))
plot(mult_fit)
mult_fit_log <- lm(log(hate_crimes_per_100k_splc) ~ median_household_income + perc_non_white + perc_non_citizen + perc_population_with_high_school_degree + gini_index + unemployment + urbanization, data=HateCrime_df)
summary(mult_fit_log)
##
## Call:
## lm(formula = log(hate_crimes_per_100k_splc) ~ median_household_income +
## perc_non_white + perc_non_citizen + perc_population_with_high_school_degree +
## gini_index + unemployment + urbanization, data = HateCrime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28845 -0.41144 0.01898 0.31334 1.13022
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.857e+01 5.553e+00 -3.344 0.00190
## median_household_income -4.732e-06 1.735e-05 -0.273 0.78658
## perc_non_white -1.232e-01 1.069e+00 -0.115 0.90887
## perc_non_citizen 1.168e+00 5.464e+00 0.214 0.83189
## perc_population_with_high_school_degree 1.121e+01 5.341e+00 2.098 0.04275
## gini_index 1.670e+01 5.744e+00 2.908 0.00611
## unemploymentlow 2.179e-01 2.088e-01 1.043 0.30353
## urbanizationlow -9.885e-02 2.467e-01 -0.401 0.69092
##
## (Intercept) **
## median_household_income
## perc_non_white
## perc_non_citizen
## perc_population_with_high_school_degree *
## gini_index **
## unemploymentlow
## urbanizationlow
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5862 on 37 degrees of freedom
## Multiple R-squared: 0.3146, Adjusted R-squared: 0.1849
## F-statistic: 2.426 on 7 and 37 DF, p-value: 0.03768
par(mfrow=c(2,2))
plot(mult_fit_log)
residual versus fitted plot and sqrt(standardized residual) versus fitted value plot, the horizontal band is also closer to zero in the residuals vs fitted value plot, which means log transformation does enable us to have more equally spread error variance. Log transformation also reduce the presence of heavy tails in QQplot in which indicates residuals seem to follow normal distribution in the current model.Finally, log transformation reduce the effect of influence point as indicate in the Residuals vs Leverage plot. Before transformation, one point is outside of the dashed line 1, which means we have an influential point that have cook’s distance larger than 1. After log transformation, all observation are within the dashed line meaning no influential point is shown in the current model.step(mult_fit_log, direction='backward')
## Start: AIC=-40.88
## log(hate_crimes_per_100k_splc) ~ median_household_income + perc_non_white +
## perc_non_citizen + perc_population_with_high_school_degree +
## gini_index + unemployment + urbanization
##
## Df Sum of Sq RSS AIC
## - perc_non_white 1 0.00456 12.719 -42.859
## - perc_non_citizen 1 0.01570 12.730 -42.820
## - median_household_income 1 0.02556 12.740 -42.785
## - urbanization 1 0.05519 12.770 -42.680
## - unemployment 1 0.37413 13.089 -41.570
## <none> 12.715 -40.875
## - perc_population_with_high_school_degree 1 1.51318 14.228 -37.815
## - gini_index 1 2.90660 15.621 -33.611
##
## Step: AIC=-42.86
## log(hate_crimes_per_100k_splc) ~ median_household_income + perc_non_citizen +
## perc_population_with_high_school_degree + gini_index + unemployment +
## urbanization
##
## Df Sum of Sq RSS AIC
## - perc_non_citizen 1 0.01114 12.730 -44.820
## - median_household_income 1 0.02946 12.749 -44.755
## - urbanization 1 0.05718 12.777 -44.657
## - unemployment 1 0.41699 13.136 -43.408
## <none> 12.719 -42.859
## - perc_population_with_high_school_degree 1 1.73309 14.452 -39.111
## - gini_index 1 2.90620 15.626 -35.599
##
## Step: AIC=-44.82
## log(hate_crimes_per_100k_splc) ~ median_household_income + perc_population_with_high_school_degree +
## gini_index + unemployment + urbanization
##
## Df Sum of Sq RSS AIC
## - median_household_income 1 0.01910 12.750 -46.752
## - urbanization 1 0.11092 12.841 -46.429
## - unemployment 1 0.41466 13.145 -45.377
## <none> 12.730 -44.820
## - perc_population_with_high_school_degree 1 1.92883 14.659 -40.471
## - gini_index 1 3.00737 15.738 -37.277
##
## Step: AIC=-46.75
## log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree +
## gini_index + unemployment + urbanization
##
## Df Sum of Sq RSS AIC
## - urbanization 1 0.09183 12.841 -48.429
## - unemployment 1 0.40424 13.154 -47.348
## <none> 12.750 -46.752
## - gini_index 1 3.02492 15.774 -39.172
## - perc_population_with_high_school_degree 1 3.15688 15.906 -38.797
##
## Step: AIC=-48.43
## log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree +
## gini_index + unemployment
##
## Df Sum of Sq RSS AIC
## - unemployment 1 0.3655 13.207 -49.166
## <none> 12.841 -48.429
## - perc_population_with_high_school_degree 1 3.3459 16.187 -40.010
## - gini_index 1 4.0555 16.897 -38.079
##
## Step: AIC=-49.17
## log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree +
## gini_index
##
## Df Sum of Sq RSS AIC
## <none> 13.207 -49.166
## - gini_index 1 3.7171 16.924 -40.007
## - perc_population_with_high_school_degree 1 4.4569 17.664 -38.081
##
## Call:
## lm(formula = log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree +
## gini_index, data = HateCrime_df)
##
## Coefficients:
## (Intercept)
## -18.95
## perc_population_with_high_school_degree
## 11.55
## gini_index
## 16.49
stepwised_multi_fit = lm( log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree +
gini_index, data = HateCrime_df)
summary(stepwised_multi_fit)
##
## Call:
## lm(formula = log(hate_crimes_per_100k_splc) ~ perc_population_with_high_school_degree +
## gini_index, data = HateCrime_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.34787 -0.39659 -0.00387 0.44892 1.06723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -18.947 4.254 -4.454 6.14e-05
## perc_population_with_high_school_degree 11.554 3.069 3.765 0.000512
## gini_index 16.486 4.795 3.438 0.001334
##
## (Intercept) ***
## perc_population_with_high_school_degree ***
## gini_index **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5608 on 42 degrees of freedom
## Multiple R-squared: 0.288, Adjusted R-squared: 0.2541
## F-statistic: 8.496 on 2 and 42 DF, p-value: 0.0007974